started: Alexey Larionov, 2016
last updated: Alexey Larionov, 27Aug2017
Calculates AFs and HWE for 480 cases (before calculating eigenvectors)
Removes 684 variants violating HWE (p <10-4 : threshold recommended by EZ)
Some of these could be valid multiallelic variants - not verifyed at this occasion
Eigenvectors are calculated using 44,508 common variants only:
5% < AF < 95% in each of the compared datasets (UBC and CBC)
Requires accessory scripts f01_calculate_eigenvectors.R and f03_qqunif_plot.R
Suggests two eigenvectors’ outliers (> 6 SdtDev on 2nd EV): P5_E09 and P6_D05
Additionally there are two outliers along the 4th EV: P2_C08 and P4_F10
Input data: 239,642 vars x 480 cases (245 UBC and 235 CBC)
Output data: 238,958 vars x 480 cases (245 UBC and 235 CBC)
# Time stamp
Sys.time()
## [1] "2017-08-27 19:10:19 BST"
# Clenan-up
rm(list=ls())
# Libraries location
.libPaths("/home/alarionov/R/my_libs_r3.4.1/")
# Base folder
library(knitr)
base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
opts_knit$set(root.dir = base_folder)
#setwd(base_folder)
# Required libraries
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(HardyWeinberg)
## Loading required package: mice
# Accessory functions
source(paste(base_folder, "scripts", "f03_qqunif_plot.R", sep="/"))
source(paste(base_folder, "scripts", "f01_calculate_eigenvectors.R", sep="/"))
# Filtering settings
hwe_th <- 0.0001
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
load(paste(base_folder, "results", "r04_filter_cases_and_variants_wecare_only.RData", sep="/"))
ls()
## [1] "base_folder"
## [2] "exac.df"
## [3] "genotypes.mx"
## [4] "hwe_th"
## [5] "kgen.df"
## [6] "normalise_and_calculate_eigenvectors.udf"
## [7] "phenotypes.df"
## [8] "qqunif.plot"
## [9] "variants.df"
dim(genotypes.mx)
## [1] 239642 480
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000001 0 1 0 0 0
## Var000000002 0 0 0 0 0
## Var000000008 0 0 NA 0 NA
## Var000000020 0 0 0 0 0
## Var000000021 0 0 0 0 0
dim(phenotypes.df)
## [1] 480 31
str(phenotypes.df)
## 'data.frame': 480 obs. of 31 variables:
## $ wes_id : chr "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## $ gwas_id : chr "id203568" "id357807" "id325472" "id304354" ...
## $ merged_id : chr "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : int 1 1 1 1 1 1 0 1 1 0 ...
## $ setno : int 203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
## $ registry : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
## $ family_history: int 1 0 1 1 1 1 1 1 1 0 ...
## $ age_dx : int 49 35 32 33 44 28 28 38 35 36 ...
## $ age_ref : num 58 36 41 34 49 28 32 44 35 38 ...
## $ rstime : num 10.17 1.83 9.75 1.59 5.66 ...
## $ eig1_gwas : num -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
## $ eig2_gwas : num 0.00266 0.00246 -0.0001 0.00595 0.01157 ...
## $ eig3_gwas : num 0.06803 0.05055 -0.00603 0.00747 0.00144 ...
## $ eig4_gwas : num -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
## $ eig5_gwas : num -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
## $ stage : num 1 2 2 1 1 1 2 1 2 1 ...
## $ er : num NA 0 0 0 NA 1 1 1 1 0 ...
## $ pr : num NA 0 0 NA NA 1 NA 1 0 0 ...
## $ hist_cat : chr "lobular" "ductal" "medullary" "ductal" ...
## $ hormone : num 0 0 0 0 1 0 0 0 0 0 ...
## $ chemo_cat : chr "no" "CMF" "Oth" "no" ...
## $ br_xray : int 1 1 0 0 1 0 0 0 1 1 ...
## $ br_xray_dose : num 1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
## $ age_menarche : num 9 13 10 12 10 13 12 11 11 NA ...
## $ age_1st_ftp : num 30 0 26 0 17 0 25 28 27 18 ...
## $ age_menopause : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ num_preg : num 1 0 1 0 1 0 1 1 1 1 ...
## $ bmi_age18 : num 20.8 22.5 23.6 18.6 19.9 ...
## $ bmi_dx : num 25.8 22.9 28.3 17.8 26.6 ...
## $ bmi_ref : num 33.3 22.9 33.1 17.8 29.8 ...
phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568 pass 1
## P1_A02 P1_A02 id357807 P1_A02_id357807 pass 1
## P1_A03 P1_A03 id325472 P1_A03_id325472 pass 1
## P1_A04 P1_A04 id304354 P1_A04_id304354 pass 1
## P1_A05 P1_A05 id222648 P1_A05_id222648 pass 1
dim(variants.df)
## [1] 239642 23
colnames(variants.df)
## [1] "SplitVarID" "SYMBOL" "TYPE"
## [4] "CHROM" "POS" "REF"
## [7] "ALT" "AC" "AF"
## [10] "AN" "Consequence" "SIFT_call"
## [13] "SIFT_score" "PolyPhen_call" "PolyPhen_score"
## [16] "CLIN_SIG" "cDNA_position" "CDS_position"
## [19] "Codons" "Protein_position" "Amino_acids"
## [22] "Existing_variation" "Multiallelic"
variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000001 Var000000001 RP11-206L10.1 INDEL 1 664486
## Var000000002 Var000000002 LINC00115 SNP 1 762330
## Var000000008 Var000000008 SAMD11 SNP 1 871215
## Var000000020 Var000000020 NOC2L SNP 1 880461
## Var000000021 Var000000021 NOC2L SNP 1 880483
dim(kgen.df)
## [1] 239642 9
colnames(kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000001 Var000000001 NA NA NA NA
## Var000000002 Var000000002 NA NA NA NA
## Var000000008 Var000000008 NA NA NA NA
## Var000000020 Var000000020 NA NA NA NA
## Var000000021 Var000000021 11 5008 0.00219649 8e-04
dim(exac.df)
## [1] 239642 48
colnames(exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000001 Var000000001 NA NA
## Var000000002 Var000000002 1.200e-02 175
## Var000000008 Var000000008 NA NA
## Var000000020 Var000000020 2.825e-05 3
## Var000000021 Var000000021 1.422e-03 151
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000001 NA NA
## Var000000002 14012 59
## Var000000008 NA NA
## Var000000020 106200 0
## Var000000021 106186 60
# Check consistency of colnames and rownames
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
Used for selecting common variants for eigenvectors computation.
Will be recalculated later after exclusion of eigenvectors outliers.
# Rename AF fields in the variants table
vars_colnames <- colnames(variants.df)
"ac_raw" -> vars_colnames[ vars_colnames == "AC" ]
"an_raw" -> vars_colnames[ vars_colnames == "AN" ]
"af_raw" -> vars_colnames[ vars_colnames == "AF" ]
vars_colnames -> colnames(variants.df)
# Function to calculate AN
get_allele_number.udf <- function(x){2*sum(!is.na(x))}
# --- Calculate total AFs --- #
# Calculate total ac, an and af
ac_all <- apply(genotypes.mx, 1, sum, na.rm=TRUE)
an_all <- apply(genotypes.mx, 1, get_allele_number.udf)
af_all <- ac_all/an_all
# Add new AFs to the variants table
variants.df <- cbind(variants.df, ac_all, an_all, af_all)
# --- Calculate ubc AFs --- #
# Prepare genotypes table
ubc_cases <- phenotypes.df$cc == 0
sum(ubc_cases) # 245
## [1] 245
ubc_genotypes.mx <- genotypes.mx[,ubc_cases]
dim(ubc_genotypes.mx)
## [1] 239642 245
# Calculate ubc ac, an and af
ac_ubc <- apply(ubc_genotypes.mx, 1, sum, na.rm=TRUE)
an_ubc <- apply(ubc_genotypes.mx, 1, get_allele_number.udf)
af_ubc <- ac_ubc/an_ubc
# Add updated AFs to variants.df
variants.df <- cbind(variants.df, ac_ubc, an_ubc, af_ubc)
# --- Calculate_cbc_AFs --- #
# Prepare genotypes table
cbc_cases <- phenotypes.df$cc == 1
sum(cbc_cases) # 235
## [1] 235
cbc_genotypes.mx <- genotypes.mx[,cbc_cases]
dim(cbc_genotypes.mx)
## [1] 239642 235
# Calculate cbc ac, an and af
ac_cbc <- apply(cbc_genotypes.mx, 1, sum, na.rm=TRUE)
an_cbc <- apply(cbc_genotypes.mx, 1, get_allele_number.udf)
af_cbc <- ac_cbc/an_cbc
# Add updated AFs to variants.df
variants.df <- cbind(variants.df, ac_cbc, an_cbc, af_cbc)
# Clean-up
rm(vars_colnames, get_allele_number.udf, ac_all, an_all, af_all,
cbc_cases, cbc_genotypes.mx, ac_cbc, an_cbc, af_cbc,
ubc_cases, ubc_genotypes.mx, ac_ubc, an_ubc, af_ubc)
Using library HardyWeinberg
# Prepare genotypes counts
genotypes_counts <- MakeCounts(t(genotypes.mx),coding=c(0,1,2))
dim(genotypes_counts)
## [1] 239642 4
genotypes_counts[1:10,]
## AA AB BB NA
## [1,] 427 44 0 9
## [2,] 471 4 0 5
## [3,] 414 3 0 63
## [4,] 477 1 0 2
## [5,] 478 1 0 1
## [6,] 478 1 0 1
## [7,] 55 211 185 29
## [8,] 463 1 0 16
## [9,] 457 1 0 22
## [10,] 471 2 0 7
# Calculate HW p-values
hwe <- HWExactStats(genotypes_counts[,1:3], verbose=FALSE)
hwe[1:10]
## [1] 0.6145350 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.7578755
## [8] 1.0000000 1.0000000 1.0000000
variants.df <- cbind(variants.df, hwe)
# Select common variants (for QQ plot only)
common_variants <- variants.df$af_all > 0.05 & variants.df$af_all < 0.95
sum(common_variants) # 46,795
## [1] 46795
# Bonferroni-style threshold - too relaxed (EZ)
1/length(hwe) # ~4e-6
## [1] 4.172891e-06
hwe_violators <- hwe < 1/length(hwe)
sum(hwe_violators) # 605
## [1] 605
# A stronger conventional threshold (10-4, EZ recommended between 5e-4 5e-5)
hwe_violators <- hwe < hwe_th
sum(hwe_violators) # 684
## [1] 684
# QQ-plots for HWE
qqunif.plot(hwe,
main="QQ plot for all HWE p-values")
## Loading required package: grid
qqunif.plot(hwe[!hwe_violators],
main=paste("QQ plot for HWE p-values",
"\n excluding HWE violaters (p<10-4)"))
qqunif.plot(hwe[!hwe_violators & common_variants],
main=paste("QQ plot for HWE p-values",
"\n excluding HWE violaters (p<10-4) and rare variants (MAF<5%)"))
# Remove variants violating HWE
variants.df <- variants.df[!hwe_violators,]
genotypes.mx <- genotypes.mx[!hwe_violators,]
kgen.df <- kgen.df[!hwe_violators,]
exac.df <- exac.df[!hwe_violators,]
# Check results
dim(variants.df)
## [1] 238958 33
# Clean-up
rm(genotypes_counts, hwe, hwe_violators, common_variants, qqunif.plot, hwe_th)
Requires source(“f01_calculate_eigenvectors.R”)
Only common variants (0.05 < AF < 0.95 in both CBC and UBC) will be used for eigenvectors calculation.
Note exclusion on both sides: low- and high- AFs:
- Low AFs remove rare variants with common allele in reference genome
- Hight AFs remove rare variants with common allele in reference genome
# --- Make subset of data with common variants only
cbc_common_vars <- variants.df$af_cbc > 0.05 & variants.df$af_cbc < 0.95
sum(cbc_common_vars) # 46,135
## [1] 46135
ubc_common_vars <- variants.df$af_ubc > 0.05 & variants.df$af_ubc < 0.95
sum(ubc_common_vars) # 46,076
## [1] 46076
common_overlap_vars <- cbc_common_vars & ubc_common_vars
sum(common_overlap_vars) # 44,508
## [1] 44508
min(variants.df$af_all[common_overlap_vars]) # ~0.05
## [1] 0.0503212
max(variants.df$af_all[common_overlap_vars]) # ~0.95
## [1] 0.9487179
common_overlap_genotypes.mx <- genotypes.mx[common_overlap_vars,]
dim(common_overlap_genotypes.mx)
## [1] 44508 480
common_overlap_genotypes.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000024 2 1 NA 1 2
## Var000000054 2 2 NA 2 NA
## Var000000058 2 2 2 2 2
## Var000000062 0 0 0 0 0
## Var000000109 0 1 0 1 0
# --- Calculate eigenvectors --- #
wecare.eigen <- normalise_and_calculate_eigenvectors.udf(common_overlap_genotypes.mx)
# Clean-up
rm(cbc_common_vars, ubc_common_vars, common_overlap_vars,
common_overlap_genotypes.mx, normalise_and_calculate_eigenvectors.udf)
Note manually coded varaintsxsamples values in figure titles:
need to be corrected manually, if changed
# --- Prepare data for plotting --- #
wecare.eigenvectors.df <- as.data.frame(wecare.eigen$vectors) # eigenvectors in columns
# Prepare colour scale
colours <- c("UBC" = "BLUE", "CBC" = "RED")
userColourScale <- scale_colour_manual(values=colours)
# Prepare cases lables
cases_labels <- as.vector(phenotypes.df$cc)
"CBC" -> cases_labels[cases_labels==1]
"UBC" -> cases_labels[cases_labels==0]
summary(as.factor(cases_labels))
## CBC UBC
## 235 245
# Prepare cases IDs (for labels on onteractive plot)
cases_IDs <- as.vector(phenotypes.df$wes_id)
# make the dataframe
data2plot.df <- cbind(cases_IDs, cases_labels, wecare.eigenvectors.df[,1:5])
colnames(data2plot.df) <- c("wes_id", "group", "ev1", "ev2", "ev3", "ev4", "ev5")
# --- Plot eig1 vs eig2 --- #
g <- ggplot(data2plot.df, aes(ev1, ev2)) +
geom_point(aes(colour=group, fill=group, text = wes_id)) +
labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector1", y = "eigenvector2") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig2 vs eig3 --- #
g <- ggplot(data2plot.df, aes(ev2, ev3)) +
geom_point(aes(colour=group, fill=group, text = wes_id)) +
labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector2", y = "eigenvector3") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig3 vs eig4 --- #
g <- ggplot(data2plot.df, aes(ev3, ev4)) +
geom_point(aes(colour=group, fill=group, text = wes_id)) +
labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector3", y = "eigenvector4") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Plot eig4 vs eig5 --- #
g <- ggplot(data2plot.df, aes(ev4, ev5)) +
geom_point(aes(colour=group, fill=group, text = wes_id)) +
labs(title="wecare common variants<br>(44,508 x 480)", x ="eigenvector4", y = "eigenvector5") +
userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# --- Clean-up --- #
rm(wecare.eigenvectors.df, g, data2plot.df, cases_IDs, cases_labels, colours, userColourScale)
Explore 6 standard deviations in 5 top eigenvectors
wecare.eigenvectors.mx <- wecare.eigen$vectors # eigenvectors in columns
# No outliers on 1st ev
ev1 <- wecare.eigenvectors.mx[,1]
ev1.positive_outliers <- ev1 > mean(ev1) + 6 * sd(ev1)
ev1.negative_outliers <- ev1 < mean(ev1) - 6 * sd(ev1)
sum(ev1.positive_outliers)
## [1] 0
sum(ev1.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev1.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev1.negative_outliers]
## character(0)
# 2 outliers on 2nd ev: P5_E09 and P6_D05
ev2 <- wecare.eigenvectors.mx[,2]
ev2.positive_outliers <- ev2 > mean(ev2) + 6 * sd(ev2)
ev2.negative_outliers <- ev2 < mean(ev2) - 6 * sd(ev2)
sum(ev2.positive_outliers)
## [1] 0
sum(ev2.negative_outliers)
## [1] 2
phenotypes.df$wes_id[ev2.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev2.negative_outliers]
## [1] "P5_E09" "P6_D05"
# No outliers on 3rd ev
ev3 <- wecare.eigenvectors.mx[,3]
ev3.positive_outliers <- ev3 > mean(ev3) + 6 * sd(ev3)
ev3.negative_outliers <- ev3 < mean(ev3) - 6 * sd(ev3)
sum(ev3.positive_outliers)
## [1] 0
sum(ev3.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev3.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev3.negative_outliers]
## character(0)
# 2 outliers on 4th ev: P2_C08 and P4_F10
ev4 <- wecare.eigenvectors.mx[,4]
ev4.positive_outliers <- ev4 > mean(ev4) + 6 * sd(ev4)
ev4.negative_outliers <- ev4 < mean(ev4) - 6 * sd(ev4)
sum(ev4.positive_outliers)
## [1] 2
sum(ev4.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev4.positive_outliers]
## [1] "P2_C08" "P4_F10"
phenotypes.df$wes_id[ev4.negative_outliers]
## character(0)
# No outliers on 5th ev
ev5 <- wecare.eigenvectors.mx[,5]
ev5.positive_outliers <- ev5 > mean(ev5) + 6 * sd(ev5)
ev5.negative_outliers <- ev5 < mean(ev5) - 6 * sd(ev5)
sum(ev5.positive_outliers)
## [1] 0
sum(ev5.negative_outliers)
## [1] 0
phenotypes.df$wes_id[ev5.positive_outliers]
## character(0)
phenotypes.df$wes_id[ev5.negative_outliers]
## character(0)
# Plot eigenvalues
plot(wecare.eigen$values, main="Eigenvalues, wecare only")
wecare.eigen$values[1:10]
## [1] 5.090430 4.072666 3.682725 3.174499 3.096095 3.086285 2.999579
## [8] 2.981127 2.946367 2.919219
# Clean-up
rm(wecare.eigenvectors.mx,
ev1, ev1.positive_outliers, ev1.negative_outliers,
ev2, ev2.positive_outliers, ev2.negative_outliers,
ev3, ev3.positive_outliers, ev3.negative_outliers,
ev4, ev4.positive_outliers, ev4.negative_outliers,
ev5, ev5.positive_outliers, ev5.negative_outliers)
ls()
## [1] "base_folder" "exac.df" "genotypes.mx" "kgen.df"
## [5] "phenotypes.df" "variants.df" "wecare.eigen"
dim(genotypes.mx)
## [1] 238958 480
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
## P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000001 0 1 0 0 0
## Var000000002 0 0 0 0 0
## Var000000008 0 0 NA 0 NA
## Var000000020 0 0 0 0 0
## Var000000021 0 0 0 0 0
dim(phenotypes.df)
## [1] 480 31
str(phenotypes.df)
## 'data.frame': 480 obs. of 31 variables:
## $ wes_id : chr "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## $ gwas_id : chr "id203568" "id357807" "id325472" "id304354" ...
## $ merged_id : chr "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
## $ filter : chr "pass" "pass" "pass" "pass" ...
## $ cc : int 1 1 1 1 1 1 0 1 1 0 ...
## $ setno : int 203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
## $ registry : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
## $ family_history: int 1 0 1 1 1 1 1 1 1 0 ...
## $ age_dx : int 49 35 32 33 44 28 28 38 35 36 ...
## $ age_ref : num 58 36 41 34 49 28 32 44 35 38 ...
## $ rstime : num 10.17 1.83 9.75 1.59 5.66 ...
## $ eig1_gwas : num -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
## $ eig2_gwas : num 0.00266 0.00246 -0.0001 0.00595 0.01157 ...
## $ eig3_gwas : num 0.06803 0.05055 -0.00603 0.00747 0.00144 ...
## $ eig4_gwas : num -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
## $ eig5_gwas : num -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
## $ stage : num 1 2 2 1 1 1 2 1 2 1 ...
## $ er : num NA 0 0 0 NA 1 1 1 1 0 ...
## $ pr : num NA 0 0 NA NA 1 NA 1 0 0 ...
## $ hist_cat : chr "lobular" "ductal" "medullary" "ductal" ...
## $ hormone : num 0 0 0 0 1 0 0 0 0 0 ...
## $ chemo_cat : chr "no" "CMF" "Oth" "no" ...
## $ br_xray : int 1 1 0 0 1 0 0 0 1 1 ...
## $ br_xray_dose : num 1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
## $ age_menarche : num 9 13 10 12 10 13 12 11 11 NA ...
## $ age_1st_ftp : num 30 0 26 0 17 0 25 28 27 18 ...
## $ age_menopause : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ num_preg : num 1 0 1 0 1 0 1 1 1 1 ...
## $ bmi_age18 : num 20.8 22.5 23.6 18.6 19.9 ...
## $ bmi_dx : num 25.8 22.9 28.3 17.8 26.6 ...
## $ bmi_ref : num 33.3 22.9 33.1 17.8 29.8 ...
phenotypes.df[1:5,1:5]
## wes_id gwas_id merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568 pass 1
## P1_A02 P1_A02 id357807 P1_A02_id357807 pass 1
## P1_A03 P1_A03 id325472 P1_A03_id325472 pass 1
## P1_A04 P1_A04 id304354 P1_A04_id304354 pass 1
## P1_A05 P1_A05 id222648 P1_A05_id222648 pass 1
dim(variants.df)
## [1] 238958 33
colnames(variants.df)
## [1] "SplitVarID" "SYMBOL" "TYPE"
## [4] "CHROM" "POS" "REF"
## [7] "ALT" "ac_raw" "af_raw"
## [10] "an_raw" "Consequence" "SIFT_call"
## [13] "SIFT_score" "PolyPhen_call" "PolyPhen_score"
## [16] "CLIN_SIG" "cDNA_position" "CDS_position"
## [19] "Codons" "Protein_position" "Amino_acids"
## [22] "Existing_variation" "Multiallelic" "ac_all"
## [25] "an_all" "af_all" "ac_ubc"
## [28] "an_ubc" "af_ubc" "ac_cbc"
## [31] "an_cbc" "af_cbc" "hwe"
variants.df[1:5,1:5]
## SplitVarID SYMBOL TYPE CHROM POS
## Var000000001 Var000000001 RP11-206L10.1 INDEL 1 664486
## Var000000002 Var000000002 LINC00115 SNP 1 762330
## Var000000008 Var000000008 SAMD11 SNP 1 871215
## Var000000020 Var000000020 NOC2L SNP 1 880461
## Var000000021 Var000000021 NOC2L SNP 1 880483
dim(kgen.df)
## [1] 238958 9
colnames(kgen.df)
## [1] "SplitVarID" "kgen.AC" "kgen.AN" "kgen.AF" "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
## SplitVarID kgen.AC kgen.AN kgen.AF kgen.AFR_AF
## Var000000001 Var000000001 NA NA NA NA
## Var000000002 Var000000002 NA NA NA NA
## Var000000008 Var000000008 NA NA NA NA
## Var000000020 Var000000020 NA NA NA NA
## Var000000021 Var000000021 11 5008 0.00219649 8e-04
dim(exac.df)
## [1] 238958 48
colnames(exac.df)
## [1] "SplitVarID" "exac_non_TCGA.AF"
## [3] "exac_non_TCGA.AC" "exac_non_TCGA.AN"
## [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
## [7] "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AN_MALE"
## [9] "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AN_Adj"
## [11] "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_Het"
## [13] "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_AFR"
## [15] "exac_non_TCGA.AN_AFR" "exac_non_TCGA.Hom_AFR"
## [17] "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Hemi_AFR"
## [19] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AN_AMR"
## [21] "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Het_AMR"
## [23] "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.AC_EAS"
## [25] "exac_non_TCGA.AN_EAS" "exac_non_TCGA.Hom_EAS"
## [27] "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Hemi_EAS"
## [29] "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AN_FIN"
## [31] "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Het_FIN"
## [33] "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.AC_NFE"
## [35] "exac_non_TCGA.AN_NFE" "exac_non_TCGA.Hom_NFE"
## [37] "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Hemi_NFE"
## [39] "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AN_SAS"
## [41] "exac_non_TCGA.Hom_SAS" "exac_non_TCGA.Het_SAS"
## [43] "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.AC_OTH"
## [45] "exac_non_TCGA.AN_OTH" "exac_non_TCGA.Hom_OTH"
## [47] "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
## SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000001 Var000000001 NA NA
## Var000000002 Var000000002 1.200e-02 175
## Var000000008 Var000000008 NA NA
## Var000000020 Var000000020 2.825e-05 3
## Var000000021 Var000000021 1.422e-03 151
## exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000001 NA NA
## Var000000002 14012 59
## Var000000008 NA NA
## Var000000020 106200 0
## Var000000021 106186 60
str(wecare.eigen)
## List of 2
## $ values : num [1:480] 5.09 4.07 3.68 3.17 3.1 ...
## $ vectors: num [1:480, 1:480] -0.15651 -0.14148 -0.00298 -0.00799 0.0193 ...
## - attr(*, "class")= chr "eigen"
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_08.17"
save.image(paste(base_folder, "results", "r05_calculate_hw_and_egenvectors_wecare_only.RData", sep="/"))
ls()
## [1] "base_folder" "exac.df" "genotypes.mx" "kgen.df"
## [5] "phenotypes.df" "variants.df" "wecare.eigen"
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lattice_0.20-35 HardyWeinberg_1.5.8 mice_2.30
## [4] plotly_4.7.1 ggplot2_2.2.1 knitr_1.16
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.11 compiler_3.4.1 plyr_1.8.4
## [4] bindr_0.1 tools_3.4.1 rpart_4.1-11
## [7] digest_0.6.12 jsonlite_1.5 evaluate_0.10.1
## [10] tibble_1.3.3 gtable_0.2.0 viridisLite_0.2.0
## [13] pkgconfig_2.0.1 rlang_0.1.1 Matrix_1.2-10
## [16] shiny_1.0.5 crosstalk_1.0.0 yaml_2.1.14
## [19] bindrcpp_0.2 dplyr_0.7.1 stringr_1.2.0
## [22] httr_1.3.1 htmlwidgets_0.9 nnet_7.3-12
## [25] rprojroot_1.2 glue_1.1.1 data.table_1.10.4
## [28] R6_2.2.2 survival_2.41-3 rmarkdown_1.6
## [31] purrr_0.2.3 tidyr_0.7.0 magrittr_1.5
## [34] splines_3.4.1 backports_1.1.0 scales_0.4.1
## [37] htmltools_0.3.6 MASS_7.3-47 assertthat_0.2.0
## [40] xtable_1.8-2 mime_0.5 colorspace_1.3-2
## [43] httpuv_1.3.5 labeling_0.3 stringi_1.1.5
## [46] lazyeval_0.2.0 munsell_0.4.3
Sys.time()
## [1] "2017-08-27 19:11:50 BST"